import numpy as np
import pandas as pd
import pystan
import seaborn as sns
import matplotlib.pyplot as plt
import arviz as az
from src.stan_data_format import format_stan_data_logistic
from src.utils import pickle_model, load_model, make_latex_table, save_latex_table
from src.clean import clean_exp_2, clean_exp_1, clean_exp3
from multiprocessing import cpu_count
import os
# Report the PyStan version; the whole pipeline uses the PyStan 2.x API.
print(pystan.__version__)
# IPython magic: this file is an exported notebook; render figures inline.
%matplotlib inline
import os
# Let Stan's map_rect / numexpr use every available core.
os.environ['STAN_NUM_THREADS'] = str(cpu_count())
os.environ['NUMEXPR_NUM_THREADS'] = str(cpu_count())
from IPython.display import HTML
# Notebook convenience widget: a JavaScript link that toggles stderr cells.
HTML('''<script>
code_show_err=false;
function code_toggle_err() {
if (code_show_err){
$('div.output_stderr').hide();
} else {
$('div.output_stderr').show();
}
code_show_err = !code_show_err
}
$( document ).ready(code_toggle_err);
</script>
To toggle on/off output_stderr, click <a href="javascript:code_toggle_err()">here</a>.''')
# File locations
# Raw per-experiment CSV exports.
EXPERIMENT_1 = './dat/raw/Experiment1.csv'
EXPERIMENT_2 = './dat/raw/Experiment2.csv'
EXPERIMENT_3 = './dat/raw/Experiment3.csv'
# Output locations
FIGURE_OUTPUT = './out/figures/'
CHAIN_OUTPUT = './out/chains/'
MODEL_OUTPUT = './out/models/'
TABLE_OUTPUT = './out/tables/'
# Cleaned-data destinations written later in this script.
CLEAN_DATA_EXP1 = './dat/cleaned/clean_exp1.csv'
CLEAN_DATA_EXP2 = './dat/cleaned/clean_exp2.csv'
SIMULATION_DATA = './dat/cleaned/simulation.csv'
# Utility function for inv_logit
def ilogit(x):
    """Inverse-logit (sigmoid): map real-valued x into (0, 1).

    Vectorized via numpy, so x may be a scalar or an array.
    """
    neg_exp = np.exp(-x)
    return 1 / (1 + neg_exp)
# Make sure every output directory exists before later cells write into it.
for location in [FIGURE_OUTPUT, CHAIN_OUTPUT, MODEL_OUTPUT, TABLE_OUTPUT]:
    if os.path.exists(location):
        print(location, "folder already exists.")
    else:
        os.makedirs(location)
        print("created folder : ", location)
# Clean and save data.
clean_exp1_df = clean_exp_1(pd.read_csv(EXPERIMENT_1))
clean_exp1_df.to_csv(CLEAN_DATA_EXP1)
# Clean and save data.
# skiprows=[1, 2]: presumably extra header rows in the raw Exp. 2 export -- confirm.
clean_exp2_df = clean_exp_2(pd.read_csv(EXPERIMENT_2, skiprows=[1, 2]))
clean_exp2_df.to_csv(CLEAN_DATA_EXP2)
from src.demographics import demographics_exp_1, demographics_exp_2
# Plot demographic figures for SI
plt.figure(figsize=(6, 6))
dem1 = demographics_exp_1(pd.read_csv(EXPERIMENT_1))
# Fix: savefig's keyword is `format`, not `fmt` (newer matplotlib rejects
# unknown kwargs); it is redundant with the .pdf extension but kept explicit.
plt.savefig(FIGURE_OUTPUT+'Exp1_demographics.pdf', format='pdf')
plt.figure(figsize=(6, 6))
# Note: Exp. 1 demographics use the raw file, Exp. 2 uses the cleaned file.
dem2 = demographics_exp_2(pd.read_csv(CLEAN_DATA_EXP2))
plt.savefig(FIGURE_OUTPUT+'Exp2_demographics.pdf', format='pdf')
# Compile the Stan model relating reported confidence to accuracy.
model_confidence_accuracy_logit = pystan.StanModel(
    file='./src/confidence_accuracy_logit.stan')
# Get the data in the format the model wants to see it.
stan_data_logistic = format_stan_data_logistic(pd.read_csv(CLEAN_DATA_EXP1))
def plot_figure1b_prior(stan_data_logistic):
    """Prior predictive check for the confidence-accuracy logistic model.

    Draws a 5x10 grid of panels; each panel samples group-level priors
    (alpha_p, beta_p) once and overlays 100 individual-level logistic
    curves over the observed confidence range.

    stan_data_logistic : dict with key 'x' (confidence predictor) as built
        by format_stan_data_logistic.
    """
    x = np.linspace(np.min(stan_data_logistic['x']), np.max(stan_data_logistic['x']), 10)
    plt.figure(figsize=(20, 10))
    for kdx in range(10):
        for idx in range(5):
            # plot_no covers 1..50 uniquely over the two loops.
            plot_no = idx+1 + 5 * kdx
            plt.subplot(5, 10, plot_no)
            # Group-level prior draws, then 100 individual-level draws.
            alpha_p = np.random.normal(0, .4, 1)
            beta_p = np.random.normal(0, .4, 1)
            alpha = np.random.normal(alpha_p, .4, 100)
            beta = np.random.normal(beta_p, .4, 100)
            y = np.array([alpha + beta * item for item in x])
            y = ilogit(y)
            plt.plot(x, y, color='grey')
            #plt.ylim(.2,.8)
            #plt.xlim(50,100)
            # Only the first column gets a y label.
            if plot_no % 10 == 1:
                plt.ylabel('Accuracy')
            else:
                plt.yticks([])
            # Fix: bottom row of a 5x10 grid is positions 41-50; the original
            # `>= 40` also labelled position 40 (fourth row, last column).
            if plot_no > 40:
                plt.xlabel('Reported \nconfidence')
            plt.xticks([])
            plt.ylim(0, 1)
# Run the prior predictive check defined above.
plot_figure1b_prior(stan_data_logistic)
print('Sampling confidence-accuracy model...')
# Get the data in the format the model wants to see it.
stan_data_logistic = format_stan_data_logistic(pd.read_csv(CLEAN_DATA_EXP1))
# MCMC Time
samples_logistic = model_confidence_accuracy_logit.sampling(
    data=stan_data_logistic)
# Extract samples
extracted_samples_logistic = samples_logistic.extract(['alpha_p', 'beta_p'])
# Save model and output for easy loading later.
pickle_model(model_confidence_accuracy_logit,
             samples_logistic,
             MODEL_OUTPUT,
             CHAIN_OUTPUT,
             'confidence_accuracy_logit')
# Plot chains
#samples_logistic.plot(pars=['alpha_p', 'beta_p'])
az.plot_trace(samples_logistic,
              var_names=['alpha_p', 'beta_p'])
# NOTE(review): `credible_interval` was renamed `hdi_prob` in arviz >= 0.11;
# this call pins an older arviz.
az.plot_forest(samples_logistic, var_names=[
               'alpha_p', 'beta_p'], credible_interval=.89)
print('...sampling complete')
print('Saving results...',)
# Reload the model and chains that were just pickled.
model_confidence_accuracy_logit, samples_logistic = load_model(MODEL_OUTPUT,
                                                               CHAIN_OUTPUT,
                                                               'confidence_accuracy_logit')
print("Generating and saving tables and figures for confidence-accuracy model...")
# Save key output as a LaTeX file.
variables = ['alpha_p', 'beta_p']
# NOTE(review): the next line is a no-op (bare expression, result discarded).
extracted_samples_logistic['beta_p']
latex_string = make_latex_table(extracted_samples_logistic, variables)
print(latex_string)
save_latex_table(TABLE_OUTPUT, 'confidence_accuracy_logit.tex', latex_string)
from src.exp_1_figures import plot_figure1a
exp_1_data = pd.read_csv(CLEAN_DATA_EXP1)
from src.exp_1_figures import joint_hpdi
# Joint HPDI plot of the group-level posteriors.
plt.figure(figsize=(3,3))
sns.set_context('paper', font_scale=1.5)
joint_hpdi(extracted_samples_logistic)
# Fix: the original called savefig BEFORE joint_hpdi() was plotted, saving a
# stale/empty figure; save after plotting, and use `format` (not `fmt`).
plt.savefig(FIGURE_OUTPUT+'joint_hpdi.pdf', format='pdf')
from src.exp_1_figures import plot_figure1b
plt.figure(figsize=(3,3))
sns.set_context('paper', font_scale=1.5)
plot_figure1b(extracted_samples_logistic, stan_data_logistic)
# Diagnostics for the population accuracy parameter.
az.plot_trace(samples_logistic, var_names=['accuracy_p'])
az.plot_forest(samples_logistic, var_names=['accuracy_p'],
               credible_interval=.89)
extracted_samples_logistic = samples_logistic.extract(['accuracy_p'])
variables = ['accuracy_p']
# Transform from the logit scale to the probability scale for reporting.
extracted_samples_logistic['accuracy_p'] = ilogit(extracted_samples_logistic['accuracy_p'])
latex_string = make_latex_table(extracted_samples_logistic, variables)
print(latex_string)
save_latex_table(TABLE_OUTPUT, 'accuracy.tex', latex_string)
print("Complete")
print("Sampling agent model of confidence...")
#Simulate different political groups
correct = 1 #Change to zero to simulate when incorrect
# Prior predictive for the agent ("belief") model: hierarchical draws of
# group-level parameters, then individual-level parameters; each subplot
# shows 100 simulated confidence reports for one simulated individual.
for p in range(5):
    plt.figure(figsize=(20,4))
    mu_a_pol = np.random.normal(0,1)
    alpha_a_pol = np.random.normal(-1,1)
    gamma_a_pol = np.random.normal(0,1)
    mu_b_pol = np.random.normal(0,.5)
    alpha_b_pol = np.random.normal(0,.5)
    gamma_b_pol = np.random.normal(0,.5)
    #And Individuals
    for k in range(20):
        mu_a = np.random.normal(mu_a_pol,.5)
        alpha_a = np.random.normal(alpha_a_pol,.5)
        gamma_a = np.random.normal(gamma_a_pol,.5)
        mu_b = np.random.normal(mu_b_pol,.5)
        alpha_b = np.random.normal(alpha_b_pol,.5)
        gamma_b = np.random.normal(gamma_b_pol,.5)
        mu = np.zeros(100)
        alpha = np.zeros(100)
        gamma = np.zeros(100)
        y_sim = np.zeros(100)
        plt.subplot(2,10,k+1)
        for i in range(100):
            theta = abs(np.random.normal(0,5));
            mu[i] = ilogit(mu_a + mu_b*correct);
            alpha[i] = ilogit(alpha_a + alpha_b*correct);
            gamma[i] = ilogit(gamma_a + gamma_b*correct);
            # Zero-one-inflated Beta: with prob alpha respond at an endpoint
            # (1 with prob gamma, else 0); otherwise draw from a Beta whose
            # mean is mu[i] and concentration is theta.
            if np.random.binomial(1, alpha[i])==1:
                if np.random.binomial(1, gamma[i]) == 1:
                    y_sim[i] = 1
                else:
                    y_sim[i] = 0
            else:
                y_sim[i] = np.random.beta(mu[i]*theta, (1-mu[i])*theta)
        sns.histplot(y_sim,bins=np.linspace(0,1,20))
        plt.xlim(0,1)
# Compile and fit the hierarchical agent ("belief") model of confidence.
belief_model = pystan.StanModel(file='./src/belief_model.stan')
exp_1_data = pd.read_csv(CLEAN_DATA_EXP1)
from src.stan_data_format import format_stan_data_belief
stan_data = format_stan_data_belief(exp_1_data)
belief_samples = belief_model.sampling(data = stan_data)
# Persist compiled model + chains for later reuse.
pickle_model(belief_model,
             belief_samples,
             MODEL_OUTPUT,
             CHAIN_OUTPUT,
             'belief')
# Trace plots for the group-level parameters.
az.plot_trace(belief_samples,
              var_names=['mu_a_pol',
                         'alpha_a_pol',
                         'gamma_a_pol',
                         'theta']);
az.plot_trace(belief_samples,
              var_names=['mu_b_pol',
                         'alpha_b_pol',
                         'gamma_b_pol']);
print('Generating tables, figures and saving....')
extracted_belief_samples = belief_samples.extract(['mu_b_pol',
                                                   'alpha_b_pol',
                                                   'gamma_b_pol','mu_a_pol',
                                                   'alpha_a_pol',
                                                   'gamma_a_pol','theta'])
variables = ['alpha_a_pol', 'gamma_a_pol', 'mu_a_pol',
             'alpha_b_pol', 'gamma_b_pol', 'mu_b_pol','theta' ]
# extracted_samples_logistic['accuracy_p'] = ilogit(extracted_samples_logistic['accuracy_p'])
latex_string = make_latex_table(extracted_belief_samples, variables)
print(latex_string)
save_latex_table(TABLE_OUTPUT, 'confidence_agent.tex', latex_string)
#Some of those chains look a bit autocorrelated but converged, so let's look at the ppd.
belief_model, belief_samples = load_model(MODEL_OUTPUT,
                                          CHAIN_OUTPUT,
                                          'belief')
# Full extraction (all parameters) for the posterior-predictive plots.
belief_samps = belief_samples.extract()
from src.posterior_predictive import plot_predicted_vs_observed_belief_model
#plt.savefig('../Graphs/PredictedVsObserved.pdf', fmt='.pdf',dpi=1500)
exp_1_data = pd.read_csv(CLEAN_DATA_EXP1)
sns.set_context('paper',font_scale=1.5)
plt.figure(figsize=(3,3))
plot_predicted_vs_observed_belief_model(belief_samps, exp_1_data);
plt.tight_layout()
# NOTE(review): `fmt` is not a valid savefig keyword (should be `format`);
# the format is inferred from the .pdf extension anyway.
plt.savefig(FIGURE_OUTPUT+'PredctedVsObservedBelief.pdf', fmt='pdf')
from src.posterior_predictive import plot_belief_model_distributions
exp_1_data = pd.read_csv(CLEAN_DATA_EXP1)
sns.set_context()
plt.figure(figsize=(15,6))
plot_belief_model_distributions(exp_1_data,belief_samps)
plt.savefig(FIGURE_OUTPUT+'PosteriorPredictiveConfidenceDistributions.pdf', fmt='pdf')
print("Sampling agent model of social influence")
# Prior predictive for the switch (social influence) model: 20 panels, each
# with 100 simulated individual logistic curves under one group-level draw.
plt.figure(figsize=(10, 4))
for idx in range(20):
    plt.subplot(4,5,idx+1)
    alpha_p = np.random.normal(0,2);
    b_conf_p = np.random.normal(0,2);
    # NOTE(review): b_socConf_p is drawn but never used in the curves below.
    b_socConf_p = np.random.normal(0,2);
    alpha = np.random.normal(alpha_p, 1, 100);
    b_conf = np.random.normal(b_conf_p, 1, 100);
    x = np.linspace(-.5,.5, 100)
    y_sim = np.array([alpha + b_conf * item for item in x])
    plt.plot(x,ilogit(y_sim),color='grey',alpha=.5)
    plt.ylim(0,1)
# Compile the switch (social influence) model once; it is fit twice below,
# separately for initially-correct and initially-incorrect trials.
switch_model = pystan.StanModel(file='./src/switch_model.stan')
from src.stan_data_format import format_stan_data_switch
exp_1_data = pd.read_csv(CLEAN_DATA_EXP1)
# Fit to trials where the participant's first answer was correct.
stan_model_data_correct, _= format_stan_data_switch(exp_1_data,correct=True)
switch_samples_correct = switch_model.sampling(data=stan_model_data_correct)
import arviz as av
#Extract samples
extracted_switch_samples_correct = switch_samples_correct.extract(['alpha_p',
                                                                   'b_conf_p',
                                                                   'b_socConf_p'])
#Save model and output for easy loading later.
pickle_model(switch_model,
             switch_samples_correct,
             MODEL_OUTPUT,
             CHAIN_OUTPUT,
             'switch_samples_correct');
#Plot chains
av.plot_trace(switch_samples_correct, var_names=['alpha_p',
                                                 'b_conf_p',
                                                 'b_socConf_p']);
print('Plotting Social influence figures, making tables and saving...')
from src.exp_1_figures import plot_fig1cd
# Re-format, this time keeping the per-trial dataframe (df) for plotting.
stan_model_data_correct, df= format_stan_data_switch(exp_1_data,correct=True)
sns.set_context('paper', font_scale=1.5)
plt.figure(figsize=(3,3))
plot_fig1cd(stan_model_data_correct, df, extracted_switch_samples_correct)
plt.title('Initially \ncorrect')
#Make table
variables = ['alpha_p','b_conf_p','b_socConf_p']
latex_string = make_latex_table(extracted_switch_samples_correct, variables)
print(latex_string)
save_latex_table(TABLE_OUTPUT, 'extracted_switch_samples_correct.tex', latex_string)
from src.stan_data_format import format_stan_data_switch
exp_1_data = pd.read_csv(CLEAN_DATA_EXP1)
# Same model, now fit to trials where the first answer was incorrect.
stan_model_data_incorrect, df = format_stan_data_switch(exp_1_data,correct=False)
switch_samples_incorrect = switch_model.sampling(data=stan_model_data_incorrect)
import arviz as av
#Extract samples
extracted_switch_samples_incorrect = switch_samples_incorrect.extract(['alpha_p',
                                                                       'b_conf_p',
                                                                       'b_socConf_p'])
#Save model and output for easy loading later.
pickle_model(switch_model,
             switch_samples_incorrect,
             MODEL_OUTPUT,
             CHAIN_OUTPUT,
             'switch_samples_incorrect');
#Plot chains
av.plot_trace(switch_samples_incorrect, var_names=['alpha_p',
                                                   'b_conf_p',
                                                   'b_socConf_p']);
from src.exp_1_figures import plot_fig1cd
stan_model_data_incorrect, df= format_stan_data_switch(exp_1_data,correct=False)
sns.set_context('paper', font_scale=1.5)
plt.figure(figsize=(3,3))
plot_fig1cd(stan_model_data_incorrect, df, extracted_switch_samples_incorrect, correct=False)
plt.title('Initially \nincorrect')
#Make table
from src.utils import make_latex_table
variables = ['alpha_p','b_conf_p','b_socConf_p']
latex_string = make_latex_table(extracted_switch_samples_incorrect, variables)
print(latex_string)
save_latex_table(TABLE_OUTPUT, 'extracted_switch_samples_incorrect.tex', latex_string)
from src.exp_1_figures import plot_switch_predicted_acuracy
# Side-by-side posterior predictive accuracy, correct vs. incorrect trials.
plt.figure(figsize=(8,4))
plt.subplot(121)
# NOTE(review): variable is named "incorrect" but this call uses correct=True.
stan_model_data_incorrect, df= format_stan_data_switch(exp_1_data,correct=True)
plot_switch_predicted_acuracy(df, switch_samples_correct, correct=True)
plt.title('Initially \ncorrect')
ax=plt.gca()
ax.text(-0.1, 1.1, 'A', transform=ax.transAxes,
        size=20, weight='bold')
plt.subplot(122)
stan_model_data_incorrect, df= format_stan_data_switch(exp_1_data,correct=False)
plot_switch_predicted_acuracy(df, switch_samples_incorrect, correct=False)
plt.title('Initially \nincorrect')
ax=plt.gca()
ax.text(-0.1, 1.1, 'B', transform=ax.transAxes,
        size=20, weight='bold')
plt.tight_layout()
# NOTE(review): `fmt` should be `format`; the extension carries the format.
plt.savefig(FIGURE_OUTPUT+'PredctedVsObservedSwitch.pdf', fmt='pdf')
print("Generating figure 1")
# NOTE(review): this cell appears to be an abandoned first attempt at
# Figure 1; the cell below repeats these exact statements and then completes
# the remaining panels.
from src.exp_1_figures import plot_figure1a
from src.exp_1_figures import plot_figure1b
sns.set_context('paper', font_scale=1.5)
plt.figure(figsize=(7,7))
plt.subplot(221)
exp_1_data = pd.read_csv(CLEAN_DATA_EXP1)
# Panel A: data split into initially-correct vs. initially-incorrect answers.
plot_figure1a(exp_1_data[exp_1_data
              .answer==True],exp_1_data[exp_1_data.answer==False])
ax=plt.gca()
ax.text(-0.1, 1.1, 'A', transform=ax.transAxes,
        size=20, weight='bold')
plt.tight_layout()
# Assemble the four-panel Figure 1.
from src.exp_1_figures import plot_figure1a
from src.exp_1_figures import plot_figure1b
sns.set_context('paper', font_scale=1.5)
plt.figure(figsize=(7,7))
plt.subplot(221)
exp_1_data = pd.read_csv(CLEAN_DATA_EXP1)
# Panel A: data split into initially-correct vs. initially-incorrect answers.
plot_figure1a(exp_1_data[exp_1_data
              .answer==True],exp_1_data[exp_1_data.answer==False])
ax=plt.gca()
ax.text(-0.1, 1.1, 'A', transform=ax.transAxes,
        size=20, weight='bold')
plt.tight_layout()
plt.subplot(222)
sns.set_context('paper', font_scale=1.4)
# Panel B: confidence-accuracy model fit (reloaded from disk).
model_confidence_accuracy_logit, samples_logistic = load_model(MODEL_OUTPUT,
                                                               CHAIN_OUTPUT,
                                                               'confidence_accuracy_logit')
#Extract samples
extracted_samples_logistic = samples_logistic.extract(['alpha_p', 'beta_p'])
#Get the data in the format the model wants to see it.
stan_data_logistic = format_stan_data_logistic(pd.read_csv(CLEAN_DATA_EXP1))
plot_figure1b(extracted_samples_logistic, stan_data_logistic)
ax=plt.gca()
ax.text(-0.1, 1.1, 'B', transform=ax.transAxes,
        size=20, weight='bold')
plt.tight_layout()
plt.subplot(223)
# Panel C: switch model, initially-correct trials.
exp_1_data = pd.read_csv(CLEAN_DATA_EXP1)
stan_model_data_incorrect, df = format_stan_data_switch(exp_1_data,correct=True)
plot_fig1cd(stan_model_data_correct, df, extracted_switch_samples_correct)
plt.title('Initially \ncorrect')
ax=plt.gca()
ax.text(-0.1, 1.1, 'C', transform=ax.transAxes,
        size=20, weight='bold')
plt.tight_layout()
plt.subplot(224)
# Panel D: switch model, initially-incorrect trials.
exp_1_data = pd.read_csv(CLEAN_DATA_EXP1)
stan_model_data_incorrect, df = format_stan_data_switch(exp_1_data,correct=False)
plot_fig1cd(stan_model_data_incorrect, df, extracted_switch_samples_incorrect,correct=False)
plt.title('Initially \nincorrect')
ax=plt.gca()
ax.text(-0.1, 1.1, 'D', transform=ax.transAxes,
        size=20, weight='bold')
plt.tight_layout()
# NOTE(review): `fmt` should be `format`; the extension carries the format.
plt.savefig(FIGURE_OUTPUT+'Figure1.pdf', fmt='pdf')
# Reload the three fitted models so the simulation section can run standalone.
switch_samples_correct_model, switch_samples_correct = load_model(MODEL_OUTPUT,
                                                                  CHAIN_OUTPUT,
                                                                  'switch_samples_correct')
extracted_switch_samples_correct = switch_samples_correct.extract(['alpha_p',
                                                                   'b_conf_p',
                                                                   'b_socConf_p'])
switch_samples_incorrect_model, switch_samples_incorrect = load_model(MODEL_OUTPUT,
                                                                      CHAIN_OUTPUT,
                                                                      'switch_samples_incorrect')
extracted_switch_samples_incorrect = switch_samples_incorrect.extract(['alpha_p',
                                                                       'b_conf_p',
                                                                       'b_socConf_p'])
belief_model, belief_samples = load_model(MODEL_OUTPUT,
                                          CHAIN_OUTPUT,
                                          'belief')
extracted_belief_samples = belief_samples.extract()
print('Running simulations... grab a cup of coffee.')
from pandarallel import pandarallel
import itertools as it
from src.simulation_study import run_single
from tqdm.auto import tqdm
from multiprocessing import cpu_count
tqdm.pandas(desc="my bar!")
# Parallelise the row-wise simulation across all cores.
pandarallel.initialize(nb_workers=cpu_count(), progress_bar=True)
# Full factorial simulation design: homophily p x difficulty x 2000 repeats
# x group lean. Note np.array(['right','left',False]) coerces False to the
# string 'False'.
dat_dict ={'proportions':[[75,50,30,50,75]],
           'p':[.5,.75,.98],
           'diff':np.linspace(.40,.6,20),
           'repeat':np.arange(2000),
           'N':[500],
           'lean':np.array(['right','left',False])}
def get_combinations_dataframe(dd):
    """Expand a dict of value lists into a DataFrame with one row per
    element of their Cartesian product; columns are the dict's keys in
    sorted order."""
    col_names = sorted(dd)
    rows = it.product(*(dd[key] for key in col_names))
    return pd.DataFrame(list(rows), columns=col_names)
df = get_combinations_dataframe(dat_dict)
# Wrap run_single so it can be applied row-wise over the design grid.
run_row = lambda dat : run_single(dat['p'],
                                  dat['diff'],
                                  dat['N'],
                                  dat['proportions'],
                                  dat['lean'],
                                  extracted_belief_samples,
                                  extracted_switch_samples_incorrect,
                                  extracted_switch_samples_correct)
simulation_results = df.parallel_apply(run_row, axis=1)
# df['correct_final'] = np.array(np.hstack(simulation_results))
sim_results = np.vstack(simulation_results)
df['correct_final'] = sim_results[:,0]
# NOTE(review): correct_start stores column 0 again, identical to
# correct_final; this was probably meant to be sim_results[:,1] -- confirm
# against run_single's return order.
df['correct_start'] = sim_results[:,0]
df.to_csv(SIMULATION_DATA)
# Round-trip through CSV; `lean` values come back as strings ('False' etc.).
simulation_results = pd.read_csv(SIMULATION_DATA)
simulation_results['total'] = np.ones(simulation_results.shape[0])
# A simulated group counts as correct if a majority ended up correct.
simulation_results['correct_majority'] = simulation_results['correct_final'] > .5
simulation_results.head()
simulation_results.shape
from scipy import stats
def get_bionomial_ci(n=100, k=200, res=1000, q=[5.5, 94.5]):
    """Grid-approximate Bayesian percentile interval for a binomial
    proportion under a uniform prior.

    n   : observed number of successes.
    k   : number of trials.
    res : grid resolution, also the number of posterior draws taken.
    q   : percentile bounds to return (default: an 89% interval).

    Returns the requested percentiles of `res` draws from the grid posterior.
    (The mutable default `q` is never mutated, so it is safe here.)
    """
    grid = np.linspace(0, 1, res)
    # Likelihood of n successes in k trials at each grid value, flat prior.
    unnormalised = stats.binom(k, grid).pmf(n) * np.ones(res)
    weights = unnormalised / np.sum(unnormalised)
    draws = np.random.choice(grid, p=weights, size=res)
    return np.percentile(draws, q=q)
def plot_simulation_results(temp,N=500):
    """Plot collective accuracy against task difficulty for the three
    homophily levels, with grid-approximate binomial CIs and a dashed
    independent-voting binomial baseline.

    temp : simulation results for a single `lean` condition.
    N    : group size used for the baseline curve.
    """
    grouped = temp.groupby(['p', 'lean','diff']).sum().reset_index()
    grouped.head()
    #plt.scatter(grouped['diff'], grouped['correct_majority'])
    # Assumes every (p, lean, diff) cell has the same number of repeats
    # -- TODO confirm (only the first cell's total is used).
    samples_per_difficulty = grouped['total'].values[0]
    ci = np.array([get_bionomial_ci(n=item,k=samples_per_difficulty) for item in grouped['correct_majority'].values])
    pal = sns.color_palette("Greens", n_colors=5)
    grouped['lower'] = ci[:,0]
    grouped['upper'] = ci[:,1]
    ps = [.5,.75,.98]
    for idx in range(3):
        temp = grouped[grouped.p==ps[idx]]
        t1 = temp.groupby(['diff']).mean().reset_index()
        plt.plot(t1['diff'].values,
                 t1['correct_majority'].values/samples_per_difficulty,
                 color=pal[idx+2])
        plt.fill_between(t1['diff'].values, t1['lower'], t1['upper'],color=pal[idx+2],alpha=.3)
    head_length = .05
    # Baseline: probability that a majority of N independent voters is correct.
    plt.plot(t1['diff'].values,
             1-stats.binom(N,t1['diff'].values).cdf(N/2),
             ls='--',
             color='grey',lw=2)
    plt.ylabel('Collective Accuracy')
    plt.xlabel('Initial proportion correct')
def add_arrow(dat):
    """Annotate the current axes with a vertical arrow at the difficulty
    where the p=.98 and p=.5 accuracy curves differ most (assumes both
    curves are already plotted on these axes)."""
    temp1 = dat[dat.p==.98]
    t1 = temp1.groupby(['diff']).mean().reset_index()
    temp2 = dat[dat.p==.5]
    t2 = temp2.groupby(['diff']).mean().reset_index()
    samples_per_difficulty = t2['total'].values[0]
    diff = t1['correct_majority']/samples_per_difficulty - t2['correct_majority']/samples_per_difficulty
    # Index of the difficulty with the largest absolute gap between curves.
    maxdiff = np.argmax(np.abs(diff))
    print(t2['diff'][maxdiff])
    ax = plt.gca()
    print(t1.shape)
    head_length = .05
    # Arrow from the p=.5 curve to the p=.98 curve at that difficulty.
    ax.arrow(t2['diff'][maxdiff],
             t2['correct_majority'][maxdiff]/samples_per_difficulty,
             0,
             t1['correct_majority'][maxdiff]/samples_per_difficulty- \
             t2['correct_majority'][maxdiff]/samples_per_difficulty + \
             head_length,
             head_width=0.015, head_length=head_length, fc='grey', ec='grey')
# Three stacked panels, one per lean condition.
plt.figure(figsize=(6,9))
plt.subplot(311)
# lean values were stringified (see dat_dict), hence the 'False' comparison.
fig = plot_simulation_results(simulation_results[simulation_results.lean=='False'])
plt.tight_layout()
plt.subplot(312)
fig = plot_simulation_results(simulation_results[simulation_results.lean=='left'])
plt.tight_layout()
plt.subplot(313)
fig = plot_simulation_results(simulation_results[simulation_results.lean=='right'])
add_arrow(simulation_results[simulation_results.lean=='right'])
plt.tight_layout()
print('Sampling model for Experiment 2')
# Prior predictive for the Exp. 2 intercepts: alpha ~ Normal(0, .2) on the
# logit scale, pushed through Binomial(100, ilogit(alpha)).
alpha = np.random.normal(0,.2,1000);
y = np.random.binomial(100, ilogit(alpha));
# NOTE: sns.distplot is deprecated in modern seaborn (histplot/displot).
sns.distplot(y)
plt.figure()
def format_stan_data_exp2(exp_2_data):
    """Aggregate Experiment 2 trials where the first answer was wrong into
    grouped count data for the validation Stan model.

    Parameters
    ----------
    exp_2_data : DataFrame with at least the columns `answer`, `state`,
        `cond_recode`, `p_recode`, `social_correct`.

    Returns
    -------
    state_data : dict with keys
        y     -- summed `social_correct` per (state, cond, p) cell,
        N     -- number of cells,
        count -- trials per cell,
        cond  -- 1-based condition index, p_recode * 3 + cond_recode + 1.
    false_exp2_data : the filtered per-trial rows (answer == False) with a
        `count` helper column added.

    Note: dead intermediate variables from the original (N, p, cond, y and
    two no-op expressions) were removed; behavior is unchanged.
    """
    # .copy() so adding `count` does not raise SettingWithCopyWarning.
    false_exp2_data = exp_2_data[exp_2_data.answer == False].copy()
    false_exp2_data['count'] = np.ones(false_exp2_data.shape[0])
    # One row per (state, condition, p) cell; sums give success/trial counts.
    grouped_exp2 = false_exp2_data.groupby(['state', 'cond_recode', 'p_recode']).sum().reset_index()
    state_data = dict(y=grouped_exp2['social_correct'].astype('int').values,
                      N=grouped_exp2.shape[0],
                      count=grouped_exp2['count'].astype('int').values,
                      cond=grouped_exp2['p_recode'].values * 3 +
                           grouped_exp2['cond_recode'].astype('int').values + 1)
    return state_data, false_exp2_data
stan_data_exp2, exp2_data = format_stan_data_exp2(pd.read_csv(CLEAN_DATA_EXP2))
#from src.stan_data_format import format_stan_data_exp2
# NOTE(review): duplicate call; the line above already built these objects.
stan_data_exp2, exp2_data = format_stan_data_exp2(pd.read_csv(CLEAN_DATA_EXP2))
# Compile and fit the Experiment 2 validation model.
exp2_model = pystan.StanModel(file='./src/experiment_2_validation.stan')
exp2_samples = exp2_model.sampling(data=stan_data_exp2)
exp2_samples
az.plot_trace(exp2_samples);
print('Plotting and saving figures, table...')
extracted_exp2 = exp2_samples.extract()
def plot_fig2F(exp2_samples):
    """Plot posterior densities of the homophily effect on accuracy for the
    three Experiment 2 conditions (alpha[3+i] vs alpha[i], on the
    probability scale), printing the mean and 89% interval of each contrast.

    exp2_samples : dict of extracted posterior draws with key 'alpha'
        (draws x 6 array).

    Bug fix: the original ignored this parameter and read the global
    `extracted_exp2`; it now uses the argument (the call site passes the
    same object, so output is unchanged).
    """
    pal = sns.diverging_palette(10, 220, sep=80, n=3,l=40,center='light')
    pal2 = sns.diverging_palette(10, 220, sep=80, n=3,l=40,center='dark')
    pal[1] = pal2[1]
    pal_order = [2,1,0]
    for idx in range(3):
        # Difference in accuracy (inverse-logit scale) with vs. without homophily.
        contrast = ilogit(exp2_samples['alpha'][:,3+idx])-ilogit(exp2_samples['alpha'][:,idx])
        sns.kdeplot(contrast,
                    shade=True,
                    color=pal[pal_order[idx]])
        print(np.mean(contrast))
        print(np.percentile(contrast, q=[5.5,94.5]))
    plt.xlabel('Impact of homophily\non accuracy')
    plt.ylabel('Density')
plot_fig2F(extracted_exp2)
variables = ['logit_alpha']
# NOTE(review): despite the name, this stores ilogit(alpha) -- alpha on the
# probability scale, not the logit scale.
extracted_exp2['logit_alpha'] = ilogit(extracted_exp2['alpha'])
latex_string = make_latex_table(extracted_exp2, variables)
print(latex_string)
save_latex_table(TABLE_OUTPUT, 'experiment2.tex', latex_string)
from PIL import Image
def make_square(im, min_size=1000, fill_color=(0, 0, 0, 0)):
    """Centre `im` on a square RGBA canvas and return the padded image.

    The canvas edge is the larger of `min_size` and the image's sides;
    `fill_color` defaults to fully transparent.
    """
    width, height = im.size
    side = max(min_size, width, height)
    canvas = Image.new('RGBA', (side, side), fill_color)
    offset = (int((side - width) / 2), int((side - height) / 2))
    canvas.paste(im, offset)
    return canvas
# Reload simulation output and rebuild the helper columns.
simulation_results = pd.read_csv(SIMULATION_DATA)
simulation_results['total'] = np.ones(simulation_results.shape[0])
simulation_results['correct_majority'] = simulation_results['correct_final'] > .5
simulation_results.head()
# NOTE: this import shadows the local plot_fig2F defined above.
from src.exp_2_figures import plot_fig2E, plot_fig2F
sns.set_context('paper', font_scale=1.3)
# Figure 2: 3x2 grid, panels A-F.
plt.subplots(3,2,figsize=(7,10.5))
plt.subplot(322)
# Panel B: no-lean simulations ('False' because lean was stringified).
fig = plot_simulation_results(simulation_results[simulation_results.lean=='False'])
ax = plt.gca()
ax.text(-0.1, 1.1, 'B', transform=ax.transAxes,
        size=20, weight='bold')
plt.subplot(321)
# Panel A: network illustration, padded square so imshow keeps aspect.
test_image = Image.open('./dat/network_img.png')
new_image = make_square(test_image)
plt.imshow(new_image)
ax = plt.gca()
ax.axis('off')
ax.text(-0.1, 1.1, 'A', transform=ax.transAxes,
        size=20, weight='bold')
plt.tight_layout()
plt.subplot(323)
ax = plt.gca()
ax.text(-0.1, 1.1, 'C', transform=ax.transAxes,
        size=20, weight='bold')
fig = plot_simulation_results(simulation_results[simulation_results.lean=='left'])
plt.tight_layout()
plt.subplot(324)
ax = plt.gca()
ax.text(-0.1, 1.1, 'D', transform=ax.transAxes,
        size=20, weight='bold')
fig = plot_simulation_results(simulation_results[simulation_results.lean=='right'])
add_arrow(simulation_results[simulation_results.lean=='right'])
plt.subplot(3,2,5)
ax = plt.gca()
ax.text(-0.1, 1.1, 'E', transform=ax.transAxes,
        size=20, weight='bold')
stan_data_exp2, exp2_data = format_stan_data_exp2(pd.read_csv(CLEAN_DATA_EXP2))
plot_fig2E(exp2_data, exp2_samples)
plt.subplot(3,2,6)
ax = plt.gca()
ax.text(-0.1, 1.1, 'F', transform=ax.transAxes,
        size=20, weight='bold')
plot_fig2F(exp2_samples)
# NOTE(review): `fmt` should be `format`; the extension carries the format.
plt.savefig(FIGURE_OUTPUT+'Figure2.pdf',dpi=1500, fmt='pdf')
print('Computing conceptual model.....')
def fmt(x, pos):
    """Matplotlib tick formatter: render x as LaTeX scientific notation,
    e.g. 0.01 -> '$1.00 \\times 10^{-2}$'. `pos` is unused (required by the
    FuncFormatter signature)."""
    mantissa, exponent = '{:.2e}'.format(x).split('e')
    return r'${} \times 10^{{{}}}$'.format(mantissa, int(exponent))
def get_prob(q1, q2, p, a11, g11, a22, g22, a21, g21, a12, g12):
    """Expected collective accuracy in the two-group conceptual model.

    q1, q2 : proportions initially correct in groups 1 and 2.
    p      : probability an interaction is within-group (homophily).
    aij/gij: switch rates when group i observes group j -- presumably
             a = switch toward correct, g = switch away; confirm against
             the paper's model description.

    Returns (q1 + q2 + drift_1 + drift_2) / 2, the mean accuracy after one
    round of influence.
    """
    within_1 = q1*(1-q1)*(a11-g11)
    between_1 = q2*a21*(1-q1)-g21*q1*(1-q2)
    within_2 = q2*(1-q2)*(a22-g22)
    between_2 = q1*a12*(1-q2)-g12*q2*(1-q1)
    drift_1 = p*(within_1) + (1-p) * (between_1)
    drift_2 = p*(within_2) + (1-p) * (between_2)
    return (q1+q2+drift_1+drift_2)/2
#1=Libera
# Switch-rate constants for the conceptual model (group 1 vs. group 2);
# values chosen so group-1 observers switch toward correct slightly more.
a11 = .25
g11 = .2
a12 = .2
g12 = .2
#2=Conservative
a22 = .2
g22 = .2
a21 = .25
g21 = .2
# Grid over homophily (p in [.5, 1]) and initial correctness share q.
p = np.linspace(0.5, 1.0, 500)
q = np.linspace(0,1,500)
X, Y = np.meshgrid(q, p)
sns.set_context('paper', font_scale=1.5)
sns.set_palette(sns.diverging_palette(100, 280, s=85, l=25, n=20))
# Change in collective accuracy relative to the no-homophily (p=.5) baseline.
Z = get_prob(X, (1-X), Y, a11, g11, a22, g22, a21, g21, a12, g12) - \
    get_prob(X, (1-X), .5, a11, g11, a22, g22, a21, g21, a12, g12)
fig,ax=plt.subplots(1,1)
cp = ax.contourf(X, Y, Z*100,levels=np.linspace(-1.5, 1.5, 41),cmap=plt.get_cmap('BrBG'))
cb = fig.colorbar(cp,ticks=[-1.5,-1,-.5,0,.5,1.0,1.5]) # Add a colorbar to a plot
cb.ax.set_title(r'$\times 10^{-2}$')
cb.set_label('Change in accuracy')
# NOTE(review): the first label duplicates '-1.0' but the first tick is -1.5;
# it probably should read '-1.5'.
cb.ax.set_yticklabels(['-1.0','-1.0','-0.5',' 0.0',' 0.5', ' 1.0', ' 1.5'])
ax.set_xlabel('$\dfrac{q_1}{q_1+q_2}$')
ax.set_ylabel('h')
plt.plot([.5,.5], [.5,1], ls='--', color='k')
plt.tight_layout()
ax.text(0.3, .1, 'Conservative \ncorrect', transform=ax.transAxes,
        size=12, weight='bold',horizontalalignment='center')
ax.text(0.7, .1, 'Liberal \ncorrect', transform=ax.transAxes,
        size=12, weight='bold',horizontalalignment='center')
# NOTE(review): `fmt` should be `format`; the extension carries the format.
plt.savefig(FIGURE_OUTPUT+'Figure3.pdf',dpi=1500, fmt='pdf')
# Experiment 3: load, clean, and plot demographics.
EXPERIMENT_3 = './dat/raw/Experiment3.csv'
data, melted = clean_exp3(pd.read_csv(EXPERIMENT_3))
melted.head()
from src.exp_3_figures import plot_exp3_demographics
plot_exp3_demographics(data)
plt.savefig('./out/figures/Experiment3Demographics.pdf',transparent=True)
print('Sampling experiment 3 model...')
# Compile the Exp. 3 model and draw from the prior only (Fixed_param).
temp_model = pystan.StanModel(file='./src/exp3_cchbm.stan')
from src.stan_data_format import format_stan_data_exp3
exp3_stan_df = format_stan_data_exp3(melted)
prior_samples = temp_model.sampling(data=exp3_stan_df, algorithm='Fixed_param')
#Fit MLE estimates for regressions from the prior predictive.
#It's a bit loose, but not out of line with what we saw in exp1, and we want
#a lot of relationship flexibilty for questions and subjects. seems reasonable.
from sklearn.linear_model import LogisticRegression
for idx in range(100):
    temp = melted.copy()
    # One random prior draw per outer iteration.
    chain = np.random.choice(4000)
    y_prior = prior_samples['y_prior'][chain,:]
    temp['y_prior'] = y_prior
    # NOTE(review): the inner loop reuses the name `idx`, shadowing the
    # outer counter (harmless for iteration, but confusing).
    for idx in range(5):
        t = temp[temp['pol_recode']==idx]
        x = t['conf'].values
        clf = LogisticRegression(random_state=0).fit(x.reshape(-1, 1), t['y_prior'])
        xtemp = np.linspace(.5,1,100).reshape(-1, 1)
        ytemp = clf.predict_proba(xtemp)
        plt.plot(xtemp,ytemp,alpha=.1)
plt.title('Prior Predictive')
plt.xlabel("Confidence")
plt.ylabel('Accuracy')
plt.ylim(0,1)
plt.xlim(.5,1)
# Compile and fit the Exp. 3 cross-classified hierarchical model.
exp3_model = pystan.StanModel(
    file='./src/exp3_cchbm.stan')
from src.stan_data_format import format_stan_data_exp3
exp3_stan_df = format_stan_data_exp3(melted)
cchbml_samples = exp3_model.sampling(data=exp3_stan_df,)
from src.exp_3_figures import plot_posterior_predictive_exp3
plot_posterior_predictive_exp3(melted, cchbml_samples)
plt.xlabel('Predicted accuracy')
plt.ylabel('Observed acccuracy')
plt.tight_layout()
plt.savefig('./out/figures/posterior_predictive_exp3.pdf')
# Convergence diagnostics: worst r_hat first.
az.summary(cchbml_samples, var_names=['alpha','beta','beta_q','gamma']).sort_values('r_hat',ascending=False)
az.summary(cchbml_samples,var_names=['beta_q','beta_s','gamma']).sort_values('r_hat',ascending=False).head()
az.plot_forest(cchbml_samples,var_names=['beta_s'])
az.plot_forest(cchbml_samples,var_names=['gamma'])
az.summary(cchbml_samples,var_names=['L_Omega_q']).sort_values('r_hat',ascending=False)
print("Effect of confidence (VC-VL)")
# gamma[:, 1, j] is the confidence effect for ideology j (0=VC ... 4=VL).
print(np.mean(cchbml_samples['gamma'][:,1,0] - cchbml_samples['gamma'][:,1,4]))
print("89% CI")
print(np.percentile(cchbml_samples['gamma'][:,1,0] - cchbml_samples['gamma'][:,1,4], q=[5.5,94.5]))
az.summary(cchbml_samples,var_names=['qm']).sort_values('r_hat',ascending=False)
print('Plotting figure 3, generating tables, and saving everything....')
def plot_exp3_incidental(melted, samples):
    """Two-panel question-level plot: per-question confidence effects by
    ideology (left) and the VC-VL contrast per question (right).

    melted  : long-format Exp. 3 data with question_code/question columns.
    samples : extracted posterior dict with 'beta_q' (draws x 10 x questions).
    """
    pal = sns.diverging_palette(10, 220, sep=80, n=5,l=40,center='light')
    pal2 = sns.diverging_palette(10, 220, sep=80, n=5,l=40,center='dark')
    pal[2] = pal2[2]
    # Questions ordered by mean accuracy.
    order = melted.groupby(['question_code']).mean()['correct'].argsort()
    q_names = []
    for item in melted.groupby(['question_code']):
        # NOTE(review): `[0]` indexes by label 0, assuming the original
        # integer index survived the melt -- confirm.
        q_names.append(item[1]['question'][0])
    plt.figure(figsize=(6,8))
    plt.subplot(121)
    # beta_q rows 5..9 hold the confidence slopes (rows 0..4 are intercepts).
    for yp,idx in enumerate(order):
        for jdx in range(5):
            temp = np.mean(samples['beta_q'][:,jdx+5,idx],axis=0)
            ci = np.percentile(samples['beta_q'][:,jdx+5,idx],q=[5.5,94.5])
            plt.scatter(temp,3*(yp+((jdx-3)*.1)),color=pal[jdx],alpha=.3)
            plt.plot([ci[0], ci[1]], [3*(yp+((jdx-3)*.1)),3*(yp+((jdx-3)*.1))],color=pal[jdx],alpha=.7)
    plt.yticks(np.arange(order.size)*3, np.array(q_names)[order])
    #plt.ylabel('alpha')
    plt.xlabel('Effect of confidence \nby question')
    plt.plot([0,0],[-1,3*np.max(order)+1],ls='--',color='k')
    plt.ylim(-2, 3*np.max(order)+3)
    plt.subplot(122)
    jdx = 2
    # Contrast between the first (row 5) and last (row -1) ideology slopes.
    for yp,idx in enumerate(order):
        temp1 = samples['beta_q'][:,5,idx]
        temp2 = samples['beta_q'][:,-1,idx]
        ci = np.percentile(temp1-temp2,q=[5.5,94.5])
        plt.scatter(np.mean(temp1-temp2),3*(yp+((jdx-3)*.1)),color='k',alpha=.3)
        plt.plot([ci[0], ci[1]], [3*(yp+((jdx-3)*.1)),3*(yp+((jdx-3)*.1))],color='k',alpha=.7)
    #plt.yticks(np.arange(order.size)*3, np.array(q_names)[order])
    #plt.ylabel('alpha')
    plt.yticks([])
    plt.xlabel('VC-VL effect of \nconfidence')
    plt.plot([0,0],[-1,3*np.max(order)+1],ls='--',color='k')
    plt.ylim(-2, 3*np.max(order)+3)
plot_exp3_incidental(melted, cchbml_samples)
plt.tight_layout()
plt.savefig('./out/figures/figure4.pdf')
# Posterior summary for gamma[:,0,:] (presumably the intercepts; index 1 is
# the confidence effect, per the VC-VL printout above).
q=[5.5,50.0,94.5]
means = np.mean(cchbml_samples['gamma'][:,0,:],axis=0)
qs = np.percentile(cchbml_samples['gamma'][:,0,:],axis=0,q=[5.5,50.0,94.5])
sigma = np.std(cchbml_samples['gamma'][:,0,:],axis=0)
var=['$\\gamma_{0,'+idx+'}$' for idx in ['VC','C','M','L','VL']]
gamma_tx = pd.DataFrame({'Variable':var,
                         'Mean':means,
                         'sd':sigma,
                         str(q[0]) + '%':qs[0],
                         str(q[1]) + '%':qs[1],
                         str(q[2]) + '%':qs[2]})
# Same summary for gamma[:,1,:] (confidence effects by ideology).
q=[5.5,50.0,94.5]
means = np.mean(cchbml_samples['gamma'][:,1,:],axis=0)
qs = np.percentile(cchbml_samples['gamma'][:,1,:],axis=0,q=[5.5,50.0,94.5])
sigma = np.std(cchbml_samples['gamma'][:,1,:],axis=0)
var=['$\gamma_{1,'+idx+'}$' for idx in ['VC','C','M','L','VL']]
gamma_tx2 = pd.DataFrame({'Variable':var,
                          'Mean':means,
                          'sd':sigma,
                          str(q[0]) + '%':qs[0],
                          str(q[1]) + '%':qs[1],
                          str(q[2]) + '%':qs[2]})
pd.concat([gamma_tx, gamma_tx2]).to_latex('./out/tables/Exp3GammaModel1.tex',index=False,escape=False)
gamma_tx2
# Question-level summary table for beta_q.
q=[5.5,50.0,94.5]
means = np.mean(np.mean(cchbml_samples['beta_q'][:,0:5,:],axis=0),axis=0)
# NOTE(review): `[0,0:5,:]` uses only the FIRST posterior draw while the
# means above use all draws (`[:,0:5,:]`); this looks like a bug -- confirm.
qs = np.percentile(cchbml_samples['beta_q'][0,0:5,:],axis=0,q=[5.5,50.0,94.5])
qs[0].shape
questions = melted.groupby('question').mean().reset_index().sort_values('question_code')['question']
effects = np.mean(np.mean(cchbml_samples['beta_q'][:,5:,:],axis=0),axis=0)
# NOTE(review): same single-draw issue here.
effect_qs = np.percentile(cchbml_samples['beta_q'][0,5:,:],axis=0,q=[5.5,50.0,94.5])
df1 = pd.DataFrame({'Question':questions.values,'$B_\text{q,INTCP}$':means,
                    str(q[0]) + '%':qs[0],
                    str(q[1]) + '%':qs[1],
                    str(q[2]) + '%':qs[2],
                    '$B_\text{q,CONF}$':effects,
                    str(q[0]) + '% ':effect_qs[0],
                    str(q[1]) + '% ':effect_qs[1],
                    str(q[2]) + '% ':effect_qs[2]})
df1.to_latex('./out/tables/questions_posterior.tex',index=False)
print('Analyzing cognitive battery....')
# Refit the Exp. 3 model with the cognitive-battery covariates included.
exp3_stan_df_cb = format_stan_data_exp3(melted,with_cb=True)
cbsamples = exp3_model.sampling(data=exp3_stan_df_cb)
# Convergence diagnostics: worst r_hat first.
az.summary(cbsamples, var_names=['alpha','beta','beta_q','gamma']).sort_values('r_hat',ascending=False)
az.summary(cbsamples,var_names=['beta_q','beta_s','gamma']).sort_values('r_hat',ascending=False).head()
az.plot_forest(cbsamples,var_names=['beta_s'])
az.plot_forest(cbsamples,var_names=['gamma'])
# Question code/text lookup for the SI table.
question_codes = pd.read_csv('./dat/Exp3questions.csv',names=['Code', 'Text'])
question_codes.sort_values('Code').to_latex('./out/tables/exp3Questions.tex',index=False)
# Question-level summary for the cognitive-battery refit.
q=[5.5,50.0,94.5]
means = np.mean(np.mean(cbsamples['beta_q'][:,0:5,:],axis=0),axis=0)
# NOTE(review): `[0,0:5,:]` again uses only the first posterior draw while
# the means use all draws -- confirm (same pattern as the earlier table).
qs = np.percentile(cbsamples['beta_q'][0,0:5,:],axis=0,q=[5.5,50.0,94.5])
qs[0].shape
questions = melted.groupby('question').mean().reset_index().sort_values('question_code')['question']
effects = np.mean(np.mean(cbsamples['beta_q'][:,5:,:],axis=0),axis=0)
effect_qs = np.percentile(cbsamples['beta_q'][0,5:,:],axis=0,q=[5.5,50.0,94.5])
# NOTE(review): unlike the earlier version, df1 here is never written to disk.
df1 = pd.DataFrame({'Question':questions.values,'$B_\text{q,INTCP}$':means,
                    str(q[0]) + '%':qs[0],
                    str(q[1]) + '%':qs[1],
                    str(q[2]) + '%':qs[2],
                    '$B_\text{q,CONF}$':effects,
                    str(q[0]) + '% ':effect_qs[0],
                    str(q[1]) + '% ':effect_qs[1],
                    str(q[2]) + '% ':effect_qs[2]})
az.summary(cbsamples,var_names=['gamma'])
q=[5.5,50.0,94.5]
# Stack intercept (gamma[:,0,:]) and confidence (gamma[:,1,:]) summaries
# side by side: six traits each.
means = np.hstack([np.mean(cbsamples['gamma'][:,0,:],axis=0),
                   np.mean(cbsamples['gamma'][:,1,:],axis=0)])
sd = np.hstack([np.std(cbsamples['gamma'][:,0,:],axis=0),
                np.std(cbsamples['gamma'][:,1,:],axis=0)])
cis = np.hstack([np.percentile(cbsamples['gamma'][:,0,:],axis=0,q=q),
                 np.percentile(cbsamples['gamma'][:,1,:],axis=0,q=q)])
names = ['Bullshit Sensitivity','Wordsums',
         'Need for Cognition','Heuristics and Biases','Numeracy','Faith in Intuition']
df_cb = pd.DataFrame({'Trait':names + names,
                      'Variable':np.hstack([np.repeat('Intcp',6), np.repeat('Conf',6)]),
                      'Mean':means,
                      'sd':sd,
                      str(q[0]) + '%':cis[0],
                      str(q[1]) + '%':cis[1],
                      str(q[2]) + '%':cis[2]})
df_cb.to_latex('./out/tables/cb.tex',index=False)
df_cb
az.summary(cbsamples,var_names=['qm']).sort_values('r_hat',ascending=False)
print('All done... easy eh?')